Analysing Data


In [114]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [101]:
test=pd.read_csv("test.csv")
test.head() #visualising last 10 data
# print g_model.loc[417,"Survived"] #individual visualisation


Out[101]:
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 892 3 Kelly, Mr. James male 34.5 0 0 330911 7.8292 NaN Q
1 893 3 Wilkes, Mrs. James (Ellen Needs) female 47.0 1 0 363272 7.0000 NaN S
2 894 2 Myles, Mr. Thomas Francis male 62.0 0 0 240276 9.6875 NaN Q
3 895 3 Wirz, Mr. Albert male 27.0 0 0 315154 8.6625 NaN S
4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0 1 1 3101298 12.2875 NaN S

In [112]:
mData=pd.read_csv("train.csv")
mData.head()


Out[112]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35 0 0 373450 8.0500 NaN S

In [103]:
mData.info()
print "_________________________________________________"
test.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 73.1+ KB
_________________________________________________
<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           418 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 31.0+ KB

Drop Non Required Coloumn


In [113]:
mData.drop(["PassengerId","Name","Ticket"], axis=1)
test.drop(["Name","Ticket"], axis=1)


Out[113]:
PassengerId Pclass Sex Age SibSp Parch Fare Cabin Embarked
0 892 3 male 34.5 0 0 7.8292 NaN Q
1 893 3 female 47.0 1 0 7.0000 NaN S
2 894 2 male 62.0 0 0 9.6875 NaN Q
3 895 3 male 27.0 0 0 8.6625 NaN S
4 896 3 female 22.0 1 1 12.2875 NaN S
5 897 3 male 14.0 0 0 9.2250 NaN S
6 898 3 female 30.0 0 0 7.6292 NaN Q
7 899 2 male 26.0 1 1 29.0000 NaN S
8 900 3 female 18.0 0 0 7.2292 NaN C
9 901 3 male 21.0 2 0 24.1500 NaN S
10 902 3 male NaN 0 0 7.8958 NaN S
11 903 1 male 46.0 0 0 26.0000 NaN S
12 904 1 female 23.0 1 0 82.2667 B45 S
13 905 2 male 63.0 1 0 26.0000 NaN S
14 906 1 female 47.0 1 0 61.1750 E31 S
15 907 2 female 24.0 1 0 27.7208 NaN C
16 908 2 male 35.0 0 0 12.3500 NaN Q
17 909 3 male 21.0 0 0 7.2250 NaN C
18 910 3 female 27.0 1 0 7.9250 NaN S
19 911 3 female 45.0 0 0 7.2250 NaN C
20 912 1 male 55.0 1 0 59.4000 NaN C
21 913 3 male 9.0 0 1 3.1708 NaN S
22 914 1 female NaN 0 0 31.6833 NaN S
23 915 1 male 21.0 0 1 61.3792 NaN C
24 916 1 female 48.0 1 3 262.3750 B57 B59 B63 B66 C
25 917 3 male 50.0 1 0 14.5000 NaN S
26 918 1 female 22.0 0 1 61.9792 B36 C
27 919 3 male 22.5 0 0 7.2250 NaN C
28 920 1 male 41.0 0 0 30.5000 A21 S
29 921 3 male NaN 2 0 21.6792 NaN C
... ... ... ... ... ... ... ... ... ...
388 1280 3 male 21.0 0 0 7.7500 NaN Q
389 1281 3 male 6.0 3 1 21.0750 NaN S
390 1282 1 male 23.0 0 0 93.5000 B24 S
391 1283 1 female 51.0 0 1 39.4000 D28 S
392 1284 3 male 13.0 0 2 20.2500 NaN S
393 1285 2 male 47.0 0 0 10.5000 NaN S
394 1286 3 male 29.0 3 1 22.0250 NaN S
395 1287 1 female 18.0 1 0 60.0000 C31 S
396 1288 3 male 24.0 0 0 7.2500 NaN Q
397 1289 1 female 48.0 1 1 79.2000 B41 C
398 1290 3 male 22.0 0 0 7.7750 NaN S
399 1291 3 male 31.0 0 0 7.7333 NaN Q
400 1292 1 female 30.0 0 0 164.8667 C7 S
401 1293 2 male 38.0 1 0 21.0000 NaN S
402 1294 1 female 22.0 0 1 59.4000 NaN C
403 1295 1 male 17.0 0 0 47.1000 NaN S
404 1296 1 male 43.0 1 0 27.7208 D40 C
405 1297 2 male 20.0 0 0 13.8625 D38 C
406 1298 2 male 23.0 1 0 10.5000 NaN S
407 1299 1 male 50.0 1 1 211.5000 C80 C
408 1300 3 female NaN 0 0 7.7208 NaN Q
409 1301 3 female 3.0 1 1 13.7750 NaN S
410 1302 3 female NaN 0 0 7.7500 NaN Q
411 1303 1 female 37.0 1 0 90.0000 C78 Q
412 1304 3 female 28.0 0 0 7.7750 NaN S
413 1305 3 male NaN 0 0 8.0500 NaN S
414 1306 1 female 39.0 0 0 108.9000 C105 C
415 1307 3 male 38.5 0 0 7.2500 NaN S
416 1308 3 male NaN 0 0 8.0500 NaN S
417 1309 3 male NaN 1 1 22.3583 NaN C

418 rows × 9 columns

Plotting Some data


In [128]:
sns.factorplot('Embarked','Survived', data=mData,size=4,aspect=3)

#divide screen in 3
fig, (axis1,axis2,axis3) = plt.subplots(1,3,figsize=(15,5))
sns.countplot(x='Embarked', data=mData, ax=axis1)
sns.countplot(x='Survived', hue="Embarked", data=mData, order=[1,0], ax=axis2)

#Below Is Wow Feature
# group by embarked, and get the mean for survived passengers for each value in Embarked
embark_perc = mData[["Embarked", "Survived"]].groupby(['Embarked'],as_index=False).mean()
sns.barplot(x='Embarked', y='Survived', data=embark_perc,order=['S','C','Q'],ax=axis3)


Out[128]:
<matplotlib.axes.AxesSubplot at 0x91999d0>

#Apply some priproccesing

Fare


In [130]:
# only for test, since there is a missing "Fare" values
test["Fare"].fillna(test["Fare"].median(), inplace=True) #replace missing value by median

# convert from float to int
mData['Fare'] = mData['Fare'].astype(int)
test['Fare']    = test['Fare'].astype(int)

# get fare for survived & didn't survive passengers 
fare_not_survived = mData["Fare"][mData["Survived"] == 0]
fare_survived     = mData["Fare"][mData["Survived"] == 1]

# get average and std for fare of survived/not survived passengers
avgerage_fare = pd.DataFrame([fare_not_survived.mean(), fare_survived.mean()])
std_fare      = pd.DataFrame([fare_not_survived.std(), fare_survived.std()])

In [138]:
# plot
mData['Fare'].plot(kind='hist', figsize=(15,3),bins=100, xlim=(0,80))


Out[138]:
<matplotlib.axes.AxesSubplot at 0xbff3e70>

In [143]:
avgerage_fare.index.names = std_fare.index.names = ["Survived"]
avgerage_fare


Out[143]:
0
Survived
0 21.690346
1 47.991228

In [147]:
avgerage_fare.plot(yerr=std_fare,kind='bar',legend=False)


Out[147]:
<matplotlib.axes.AxesSubplot at 0xc667470>

Age


In [150]:
fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4))
axis1.set_title('Original Age values - Titanic')
axis2.set_title('New Age values - Titanic')


# get average, std, and number of NaN values in titanic_df
average_age_titanic   = mData["Age"].mean()
std_age_titanic       = mData["Age"].std()
count_nan_age_titanic = mData["Age"].isnull().sum()

# get average, std, and number of NaN values in test
average_age_test   = test["Age"].mean()
std_age_test       = test["Age"].std()
count_nan_age_test = test["Age"].isnull().sum()

# convert from float to int
mData['Age'] = mData['Age'].astype(int)
test['Age']    = test['Age'].astype(int)

# plot original Age values
mData['Age'].hist(bins=70, ax=axis1)
  
# generate random numbers between (mean - std) & (mean + std) ## WOW
rand_1 = np.random.randint(average_age_titanic - std_age_titanic, average_age_titanic + std_age_titanic, size = count_nan_age_titanic)
rand_2 = np.random.randint(average_age_test - std_age_test, average_age_test + std_age_test, size = count_nan_age_test)

# fill NaN values in Age column with random values generated
mData["Age"][np.isnan(mData["Age"])] = rand_1
test["Age"][np.isnan(test["Age"])] = rand_2


      
# plot new Age Values
mData['Age'].hist(bins=70, ax=axis2)


c:\python27\lib\site-packages\ipykernel\__main__.py:29: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
c:\python27\lib\site-packages\ipykernel\__main__.py:30: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
Out[150]:
<matplotlib.axes.AxesSubplot at 0xd292170>

In [151]:
# .... continue with plot Age column

# peaks for survived/not survived passengers by their age
facet = sns.FacetGrid(mData, hue="Survived",aspect=4)
facet.map(sns.kdeplot,'Age',shade= True)
facet.set(xlim=(0, mData['Age'].max()))
facet.add_legend()

# average survived passengers by age
fig, axis1 = plt.subplots(1,1,figsize=(18,4))
average_age = mData[["Age", "Survived"]].groupby(['Age'],as_index=False).mean()
sns.barplot(x='Age', y='Survived', data=average_age)


Out[151]:
<matplotlib.axes.AxesSubplot at 0xd6de190>

Family


In [152]:
# Family

# Instead of having two columns Parch & SibSp, 
# we can have only one column represent if the passenger had any family member aboard or not,
# Meaning, if having any family member(whether parent, brother, ...etc) will increase chances of Survival or not.
mData['Family'] =  mData['Parch'] + mData['SibSp']

mData['Family'].loc[mData['Family'] > 0] = 1
mData['Family'].loc[mData['Family'] == 0] = 0

test['Family'] =  test['Parch'] + test['SibSp']
test['Family'].loc[test['Family'] > 0] = 1
test['Family'].loc[test['Family'] == 0] = 0

# drop Parch & SibSp
mData = mData.drop(['SibSp','Parch'], axis=1)
test    = test.drop(['SibSp','Parch'], axis=1)

# plot
fig, (axis1,axis2) = plt.subplots(1,2,sharex=True,figsize=(10,5))

# sns.factorplot('Family',data=titanic_df,kind='count',ax=axis1)
sns.countplot(x='Family', data=mData, order=[1,0], ax=axis1)

# average of survived for those who had/didn't have any family member
family_perc = mData[["Family", "Survived"]].groupby(['Family'],as_index=False).mean()
sns.barplot(x='Family', y='Survived', data=family_perc, order=[1,0], ax=axis2)

axis1.set_xticklabels(["With Family","Alone"], rotation=0)


c:\python27\lib\site-packages\pandas\core\indexing.py:117: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
Out[152]:
[<matplotlib.text.Text at 0xda7d530>, <matplotlib.text.Text at 0xda91ab0>]

Sex


In [153]:
# Sex

# As we see, children(age < ~16) on aboard seem to have a high chances for Survival.
# So, we can classify passengers as males, females, and child
def get_person(passenger):
    age,sex = passenger
    return 'child' if age < 16 else sex
    
mData['Person'] = mData[['Age','Sex']].apply(get_person,axis=1)
test['Person']    = test[['Age','Sex']].apply(get_person,axis=1)

# No need to use Sex column since we created Person column
mData.drop(['Sex'],axis=1,inplace=True)
test.drop(['Sex'],axis=1,inplace=True)

# create dummy variables for Person column, & drop Male as it has the lowest average of survived passengers
person_dummies_titanic  = pd.get_dummies(mData['Person'])
person_dummies_titanic.columns = ['Male','Female','Child']
person_dummies_titanic.drop(['Male'], axis=1, inplace=True)

person_dummies_test  = pd.get_dummies(test['Person'])
person_dummies_test.columns = ['Male','Female','Child']
person_dummies_test.drop(['Male'], axis=1, inplace=True)

mData = mData.join(person_dummies_titanic)
test    = test.join(person_dummies_test)

fig, (axis1,axis2) = plt.subplots(1,2,figsize=(10,5))

# sns.factorplot('Person',data=titanic_df,kind='count',ax=axis1)
sns.countplot(x='Person', data=mData, ax=axis1)

# average of survived for each Person(male, female, or child)
family_perc = mData[["Person", "Survived"]].groupby(['Person'],as_index=False).mean()
sns.barplot(x='Person', y='Survived', data=family_perc, ax=axis2, order=['male','female','child'])

mData.drop(['Person'],axis=1,inplace=True)
test.drop(['Person'],axis=1,inplace=True)



In [ ]: